1   package org.apache.lucene.util.automaton;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.util.ArrayList;
21  import java.util.List;
22  
23  import org.apache.lucene.util.LuceneTestCase;
24  
25  import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
26  
27  public class TestLevenshteinAutomata extends LuceneTestCase {
28   
29    public void testLev0() throws Exception {
30      assertLev("", 0);
31      assertCharVectors(0);
32    }
33    
34    public void testLev1() throws Exception {
35      assertLev("", 1);
36      assertCharVectors(1);
37    }
38    
39    public void testLev2() throws Exception {
40      assertLev("", 2);
41      assertCharVectors(2);
42    }
43    
44    // LUCENE-3094
45    public void testNoWastedStates() throws Exception {
46      assertFalse(Operations.hasDeadStatesFromInitial(new LevenshteinAutomata("abc", false).toAutomaton(1)));
47    }
48    
49    /** 
50     * Tests all possible characteristic vectors for some n
51     * This exhaustively tests the parametric transitions tables.
52     */
53    private void assertCharVectors(int n) {
54      int k = 2*n + 1;
55      // use k + 2 as the exponent: the formula generates different transitions
56      // for w, w-1, w-2
57      int limit = (int) Math.pow(2, k + 2);
58      for (int i = 0; i < limit; i++) {
59        String encoded = Integer.toString(i, 2);
60        assertLev(encoded, n);
61      }
62    }
63  
64    /**
65     * Builds a DFA for some string, and checks all Lev automata
66     * up to some maximum distance.
67     */
68    private void assertLev(String s, int maxDistance) {
69      LevenshteinAutomata builder = new LevenshteinAutomata(s, false);
70      LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);
71      Automaton automata[] = new Automaton[maxDistance + 1];
72      Automaton tautomata[] = new Automaton[maxDistance + 1];
73      for (int n = 0; n < automata.length; n++) {
74        automata[n] = builder.toAutomaton(n);
75        tautomata[n] = tbuilder.toAutomaton(n);
76        assertNotNull(automata[n]);
77        assertNotNull(tautomata[n]);
78        assertTrue(automata[n].isDeterministic());
79        assertTrue(tautomata[n].isDeterministic());
80        assertTrue(Operations.isFinite(automata[n]));
81        assertTrue(Operations.isFinite(tautomata[n]));
82        assertFalse(Operations.hasDeadStatesFromInitial(automata[n]));
83        assertFalse(Operations.hasDeadStatesFromInitial(tautomata[n]));
84        // check that the dfa for n-1 accepts a subset of the dfa for n
85        if (n > 0) {
86          assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
87                                         Operations.removeDeadStates(automata[n])));
88          assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
89                                         Operations.removeDeadStates(tautomata[n])));
90          assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
91                                         Operations.removeDeadStates(automata[n])));
92          assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
93                                         Operations.removeDeadStates(tautomata[n])));
94          assertNotSame(automata[n-1], automata[n]);
95        }
96        // check that Lev(N) is a subset of LevT(N)
97        assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n]),
98                                       Operations.removeDeadStates(tautomata[n])));
99        // special checks for specific n
100       switch(n) {
101         case 0:
102           // easy, matches the string itself
103           assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(automata[0])));
104           assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(tautomata[0])));
105           break;
106         case 1:
107           // generate a lev1 naively, and check the accepted lang is the same.
108           assertTrue(Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1])));
109           assertTrue(Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1])));
110           break;
111         default:
112           assertBruteForce(s, automata[n], n);
113           assertBruteForceT(s, tautomata[n], n);
114           break;
115       }
116     }
117   }
118   
119   /**
120    * Return an automaton that accepts all 1-character insertions, deletions, and
121    * substitutions of s.
122    */
123   private Automaton naiveLev1(String s) {
124     Automaton a = Automata.makeString(s);
125     a = Operations.union(a, insertionsOf(s));
126     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
127     a = Operations.union(a, deletionsOf(s));
128     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
129     a = Operations.union(a, substitutionsOf(s));
130     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
131     
132     return a;
133   }
134   
135   /**
136    * Return an automaton that accepts all 1-character insertions, deletions,
137    * substitutions, and transpositions of s.
138    */
139   private Automaton naiveLev1T(String s) {
140     Automaton a = naiveLev1(s);
141     a = Operations.union(a, transpositionsOf(s));
142     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
143     return a;
144   }
145   
146   /**
147    * Return an automaton that accepts all 1-character insertions of s (inserting
148    * one character)
149    */
150   private Automaton insertionsOf(String s) {
151     List<Automaton> list = new ArrayList<>();
152     
153     for (int i = 0; i <= s.length(); i++) {
154       Automaton a = Automata.makeString(s.substring(0, i));
155       a = Operations.concatenate(a, Automata.makeAnyChar());
156       a = Operations.concatenate(a, Automata.makeString(s.substring(i)));
157       list.add(a);
158     }
159     
160     Automaton a = Operations.union(list);
161     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
162     return a;
163   }
164   
165   /**
166    * Return an automaton that accepts all 1-character deletions of s (deleting
167    * one character).
168    */
169   private Automaton deletionsOf(String s) {
170     List<Automaton> list = new ArrayList<>();
171     
172     for (int i = 0; i < s.length(); i++) {
173       Automaton a = Automata.makeString(s.substring(0, i));
174       a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
175       list.add(a);
176     }
177     
178     Automaton a = Operations.union(list);
179     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
180     return a;
181   }
182   
183   /**
184    * Return an automaton that accepts all 1-character substitutions of s
185    * (replacing one character)
186    */
187   private Automaton substitutionsOf(String s) {
188     List<Automaton> list = new ArrayList<>();
189     
190     for (int i = 0; i < s.length(); i++) {
191       Automaton a = Automata.makeString(s.substring(0, i));
192       a = Operations.concatenate(a, Automata.makeAnyChar());
193       a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
194       list.add(a);
195     }
196     
197     Automaton a = Operations.union(list);
198     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
199     return a;
200   }
201   
202   /**
203    * Return an automaton that accepts all transpositions of s
204    * (transposing two adjacent characters)
205    */
206   private Automaton transpositionsOf(String s) {
207     if (s.length() < 2) {
208       return Automata.makeEmpty();
209     }
210     List<Automaton> list = new ArrayList<>();
211     for (int i = 0; i < s.length()-1; i++) {
212       StringBuilder sb = new StringBuilder();
213       sb.append(s.substring(0, i));
214       sb.append(s.charAt(i+1));
215       sb.append(s.charAt(i));
216       sb.append(s.substring(i+2, s.length()));
217       String st = sb.toString();
218       if (!st.equals(s)) {
219         list.add(Automata.makeString(st));
220       }
221     }
222     Automaton a = Operations.union(list);
223     a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
224     return a;
225   }
226   
227   private void assertBruteForce(String input, Automaton dfa, int distance) {
228     CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
229     int maxLen = input.length() + distance + 1;
230     int maxNum = (int) Math.pow(2, maxLen);
231     for (int i = 0; i < maxNum; i++) {
232       String encoded = Integer.toString(i, 2);
233       boolean accepts = ra.run(encoded);
234       if (accepts) {
235         assertTrue(getDistance(input, encoded) <= distance);
236       } else {
237         assertTrue(getDistance(input, encoded) > distance);
238       }
239     }
240   }
241   
242   private void assertBruteForceT(String input, Automaton dfa, int distance) {
243     CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
244     int maxLen = input.length() + distance + 1;
245     int maxNum = (int) Math.pow(2, maxLen);
246     for (int i = 0; i < maxNum; i++) {
247       String encoded = Integer.toString(i, 2);
248       boolean accepts = ra.run(encoded);
249       if (accepts) {
250         assertTrue(getTDistance(input, encoded) <= distance);
251       } else {
252         assertTrue(getTDistance(input, encoded) > distance);
253       }
254     }
255   }
256   
257   //*****************************
258   // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
259   //*****************************
260   private int getDistance (String target, String other) {
261     char[] sa;
262     int n;
263     int p[]; //'previous' cost array, horizontally
264     int d[]; // cost array, horizontally
265     int _d[]; //placeholder to assist in swapping p and d
266     
267       /*
268          The difference between this impl. and the previous is that, rather
269          than creating and retaining a matrix of size s.length()+1 by t.length()+1,
270          we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
271          is the 'current working' distance array that maintains the newest distance cost
272          counts as we iterate through the characters of String s.  Each time we increment
273          the index of String t we are comparing, d is copied to p, the second int[].  Doing so
274          allows us to retain the previous cost counts as required by the algorithm (taking
275          the minimum of the cost count to the left, up one, and diagonally up and to the left
276          of the current cost count being calculated).  (Note that the arrays aren't really
277          copied anymore, just switched...this is clearly much better than cloning an array
278          or doing a System.arraycopy() each time  through the outer loop.)
279 
280          Effectively, the difference between the two implementations is this one does not
281          cause an out of memory condition when calculating the LD over two very large strings.
282        */
283 
284       sa = target.toCharArray();
285       n = sa.length;
286       p = new int[n+1]; 
287       d = new int[n+1]; 
288     
289       final int m = other.length();
290       if (n == 0 || m == 0) {
291         if (n == m) {
292           return 0;
293         }
294         else {
295           return Math.max(n, m);
296         }
297       } 
298 
299 
300       // indexes into strings s and t
301       int i; // iterates through s
302       int j; // iterates through t
303 
304       char t_j; // jth character of t
305 
306       int cost; // cost
307 
308       for (i = 0; i<=n; i++) {
309           p[i] = i;
310       }
311 
312       for (j = 1; j<=m; j++) {
313           t_j = other.charAt(j-1);
314           d[0] = j;
315 
316           for (i=1; i<=n; i++) {
317               cost = sa[i-1]==t_j ? 0 : 1;
318               // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
319               d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);
320           }
321 
322           // copy current distance counts to 'previous row' distance counts
323           _d = p;
324           p = d;
325           d = _d;
326       }
327 
328       // our last action in the above loop was to switch d and p, so p now
329       // actually has the most recent cost counts
330       return Math.abs(p[n]);
331   }
332   
333   private int getTDistance(String target, String other) {
334     char[] sa;
335     int n;
336     int d[][]; // cost array
337 
338     sa = target.toCharArray();
339     n = sa.length;
340     final int m = other.length();
341     d = new int[n+1][m+1];
342     
343     if (n == 0 || m == 0) {
344       if (n == m) {
345         return 0;
346       }
347       else {
348         return Math.max(n, m);
349       }
350     } 
351 
352     // indexes into strings s and t
353     int i; // iterates through s
354     int j; // iterates through t
355 
356     char t_j; // jth character of t
357 
358     int cost; // cost
359 
360     for (i = 0; i<=n; i++) {
361       d[i][0] = i;
362     }
363     
364     for (j = 0; j<=m; j++) {
365       d[0][j] = j;
366     }
367 
368       for (j = 1; j<=m; j++) {
369           t_j = other.charAt(j-1);
370 
371           for (i=1; i<=n; i++) {
372               cost = sa[i-1]==t_j ? 0 : 1;
373               // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
374               d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
375               // transposition
376               if (i > 1 && j > 1 && target.charAt(i-1) == other.charAt(j-2) && target.charAt(i-2) == other.charAt(j-1)) {
377                 d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
378               }
379           }
380       }
381 
382       // our last action in the above loop was to switch d and p, so p now
383       // actually has the most recent cost counts
384       return Math.abs(d[n][m]);
385   }
386 }